In [1]:
%matplotlib inline
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load the housing dataset (columns match the California census-block-group
# housing data: lat/long, rooms, income, house value, ocean proximity).
data=pd.read_csv('housing.csv')
In [3]:
data.head()  # preview the first five rows
Out[3]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [4]:
data.shape  # (rows, columns)
Out[4]:
(20640, 10)
In [5]:
data.info()  # dtypes and non-null counts; note total_bedrooms has missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [6]:
data.isnull().sum()  # total_bedrooms is the only column with nulls (207)
Out[6]:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64
In [7]:
# Fill the 207 missing total_bedrooms values with the column's mode.
# NOTE(review): for a continuous count column the median is usually a better
# imputation choice than the mode — consider switching.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].mode()[0])
In [8]:
# Numeric-only view used to drive the per-column plots below.
# NOTE(review): select_dtypes returns a new frame, so this snapshot will not
# reflect later in-place edits to `data` — only its column names are reused.
numeric_columns= data.select_dtypes(exclude='object')
In [9]:
# One figure per numeric feature: distribution (hist+KDE), outlier box plot,
# and a bar chart of the describe() summary statistics.
for col in numeric_columns:
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    sns.histplot(data=data, x=col, bins=30, kde=True, ax=axs[0])
    sns.boxplot(x=data[col], ax=axs[1])
    sns.barplot(data=data[col].describe().reset_index(), x='index', y=col, ax=axs[2])
    # Title each figure so the panels are identifiable when skimming.
    fig.suptitle(col)
    fig.tight_layout()
In [10]:
def find_boundry(df, variable):
    """Return the (lower, upper) IQR fences for ``df[variable]``.

    Values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are treated as
    outliers.  (Name kept as-is — "boundary" is misspelled — so any
    existing callers keep working.)
    """
    Q1 = df[variable].quantile(.25)
    Q3 = df[variable].quantile(.75)
    IQR = Q3 - Q1
    lower_boundry = Q1 - 1.5 * IQR
    upper_boundry = Q3 + 1.5 * IQR

    return lower_boundry, upper_boundry


def treat_outliers(df, variable):
    """Cap ``df[variable]`` in place at its IQR fences and return df.head().

    Note: this mutates ``df``; the returned head() is only for display.
    """
    lower_boundry, upper_boundry = find_boundry(df, variable)
    # Series.clip does both np.where passes in one vectorized call.
    df[variable] = df[variable].clip(lower=lower_boundry, upper=upper_boundry)
    return df.head()
In [11]:
# Cap each skewed numeric feature at its IQR fences (mutates `data` in place).
for column in ['total_rooms', 'total_bedrooms', 'population',
               'households', 'median_income', 'median_house_value']:
    treat_outliers(data, column)
data.head()
Out[11]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.000 129.0 322.0 126.0 8.013025 452600.0 NEAR BAY
1 -122.22 37.86 21.0 5698.375 1106.0 2401.0 1092.5 8.013025 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.000 190.0 496.0 177.0 7.257400 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.000 235.0 558.0 219.0 5.643100 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.000 280.0 565.0 259.0 3.846200 342200.0 NEAR BAY
In [12]:
# Re-plot the same three panels per feature to confirm the IQR capping
# removed the extreme tails.
for col in numeric_columns:
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    sns.histplot(data=data, x=col, bins=30, kde=True, ax=axs[0])
    sns.boxplot(x=data[col], ax=axs[1])
    sns.barplot(data=data[col].describe().reset_index(), x='index', y=col, ax=axs[2])
    # Title each figure so the panels are identifiable when skimming.
    fig.suptitle(col)
    fig.tight_layout()
In [13]:
# sns.pairplot builds its own figure grid, so a preceding plt.figure() call
# only creates an empty canvas (the stray "<Figure ... with 0 Axes>" output).
sns.pairplot(data)
plt.show()
<Figure size 2000x2000 with 0 Axes>
In [14]:
plt.figure(figsize=(8,8))
# ocean_proximity is still a string column at this point (it is label-encoded
# only in a later cell), so restrict the correlation matrix to numeric
# columns; modern pandas raises on object columns in DataFrame.corr().
sns.heatmap(data.select_dtypes(exclude='object').corr(), annot=True)
Out[14]:
<AxesSubplot:>
In [15]:
data.head()  # inspect the frame after outlier capping
Out[15]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.000 129.0 322.0 126.0 8.013025 452600.0 NEAR BAY
1 -122.22 37.86 21.0 5698.375 1106.0 2401.0 1092.5 8.013025 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.000 190.0 496.0 177.0 7.257400 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.000 235.0 558.0 219.0 5.643100 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.000 280.0 565.0 259.0 3.846200 342200.0 NEAR BAY
In [16]:
# Integer-encode every object (string) column with LabelEncoder so the frame
# is fully numeric for correlation analysis and modeling.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = label_encoder.fit_transform(data[column])

Feature Engineering¶

In [17]:
# Engineered ratio/sum features.
data['room_per_household'] = data['total_rooms'] / data['households']
data['avg_bedrooms'] = data['total_bedrooms'] / data['total_rooms']
# NOTE(review): the column name reads "population/households" but the value
# computed is households / population — confirm which ratio was intended.
data['population/households'] = data['households'] / data['population']
data['total_area_of_rooms'] = data['total_rooms'] + data['total_bedrooms']
In [18]:
data.shape  # 10 original + 4 engineered columns
Out[18]:
(20640, 14)

Modeling¶

In [19]:
# Correlation heatmap again, now including the encoded category and the
# engineered features (all columns are numeric by this point).
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(data.corr(), annot=True, cmap="YlGnBu", ax=ax)
Out[19]:
<AxesSubplot:>
In [20]:
from sklearn.model_selection import train_test_split
In [21]:
# Split columns into predictors and the regression target.
target = data['median_house_value']
features = data.drop(columns='median_house_value')
In [22]:
# Hold out 20% of the rows for testing; fixed random_state for reproducibility.
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2 ,random_state=0)
In [23]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape  # sanity-check the 80/20 split
Out[23]:
((16512, 13), (4128, 13), (16512,), (4128,))
In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
In [25]:
def regressor_rmse(X_train, y_train, X_test, y_test):
    """Fit several baseline regressors and report their test-set RMSE.

    Each model class is instantiated with default hyperparameters, fit on
    the training split, and scored on the test split.

    Returns:
        dict mapping model class name -> RMSE, so results are usable
        programmatically instead of only being printed.
    """
    results = {}
    for model in [LinearRegression, SVR, Lasso, Ridge, KNeighborsRegressor]:
        regressor = model()
        regressor.fit(X_train, y_train)
        predictions = regressor.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        results[model.__name__] = rmse
        print('RMSE for {} is {}'.format(model.__name__, rmse))
    return results
In [26]:
#regressor_rmse(X_train, y_train, X_test, y_test)

GridSearchCV¶

In [27]:
from sklearn.model_selection import GridSearchCV
def hyperparameter_tuning(model, train_data, target_data, param_grid):
    """Exhaustively grid-search ``param_grid`` with 5-fold CV.

    Args:
        model: estimator to tune.
        train_data: feature matrix.
        target_data: target values.
        param_grid: dict of hyperparameter name -> candidate values.

    Returns:
        dict with 'best_params' (winning hyperparameters) and
        'best_model' (the refit best estimator).
    """
    searcher = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
    searcher.fit(train_data, target_data)
    return {'best_params': searcher.best_params_, 'best_model': searcher.best_estimator_}
In [28]:
# Base LightGBM model; only the seed is fixed, hyperparameters come from the grid search.
lgbm_model = LGBMRegressor(random_state=101)
In [29]:
# Search space for the LightGBM grid search.
param_grid = {
    "n_estimators": [10, 100, 250, 300],   # number of boosting rounds
    "min_child_samples": [20, 70, 100],    # minimum data points per leaf
    "num_leaves": [15, 50],                # per-tree complexity
    "max_depth": [5, 15],                  # depth cap
}
In [30]:
# Grid search over the full dataset; the returned dict shows the best params.
hyperparameter_tuning(lgbm_model,features,target,param_grid)
Out[30]:
{'best_params': {'max_depth': 15,
  'min_child_samples': 70,
  'n_estimators': 250,
  'num_leaves': 15},
 'best_model': LGBMRegressor(max_depth=15, min_child_samples=70, n_estimators=250,
               num_leaves=15, random_state=101)}
In [31]:
# Re-create the winning estimator from the grid-search result above.
best_lgbm_model = LGBMRegressor(max_depth=15, min_child_samples=70, n_estimators=250,
               num_leaves=15, random_state=101)
In [32]:
# Fit the tuned model on the training split only.
best_lgbm_model.fit(X_train,y_train)
Out[32]:
LGBMRegressor(max_depth=15, min_child_samples=70, n_estimators=250,
              num_leaves=15, random_state=101)
In [33]:
# Predictions on the held-out test split.
test_pred= best_lgbm_model.predict(X_test)
In [34]:
# Test-set RMSE of the tuned LightGBM model.
np.sqrt(mean_squared_error(y_test, test_pred))
Out[34]:
44315.336599218215
In [35]:
# Side-by-side view of actual vs. predicted house values on the test split.
pd.DataFrame({'actual':y_test, 'predicted': test_pred})
Out[35]:
actual predicted
14740 136900.0 140941.098879
10101 241300.0 261363.085993
20566 200700.0 147172.674029
2670 72500.0 66280.827409
15709 460000.0 465845.150060
... ... ...
6655 169500.0 203877.415146
3505 204600.0 198540.108607
1919 128600.0 133469.431245
1450 259500.0 231089.680272
4148 167600.0 193282.365049

4128 rows × 2 columns

In [36]:
#dt_model = DecisionTreeRegressor()
In [37]:
#param_grid = {
#            "max_depth": [5, 15],
#            "min_samples_leaf": [2, 25],
#            "max_features": [0.1, 0.9]
#   }
In [38]:
#hyperparameter_tuning(dt_model,features,target,param_grid)
In [39]:
#model = DecisionTreeRegressor(max_depth=15, max_features=0.9, min_samples_leaf=25,
#                       random_state=101)
In [40]:
#model.fit(X_train,y_train)
In [41]:
#test_pred = model.predict(X_test)
In [42]:
#model.feature_importances_
In [43]:
#model.feature_names_in_
In [44]:
#pd.DataFrame({"features":model.feature_names_in_ ,"importance":model.feature_importances_}).sort_values("importance",ascending=False)
In [45]:
#np.sqrt(mean_squared_error(y_test, test_pred))
In [46]:
#pd.DataFrame({'actual':y_test, 'predicted': test_pred})

RandomizedSearchCV¶

In [47]:
from sklearn.model_selection import RandomizedSearchCV

def randomSearchCV(estimator, X_train, y_train, param_distributions, n_iter, cv=5, scoring=None, n_jobs=-1, random_state=42):
    """Tune ``estimator`` with randomized search over ``param_distributions``.

    Args:
        estimator: model to tune.
        X_train, y_train: training features and target.
        param_distributions: dict of hyperparameter name -> distribution/choices.
        n_iter: number of sampled parameter settings.
        cv: number of cross-validation folds.
        scoring: sklearn scoring string or callable (None = estimator default).
        n_jobs: parallel workers (-1 = all cores).
        random_state: seed for reproducible sampling.

    Returns:
        dict with 'best_params' and the refit 'best_model'.
    """
    searcher = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    searcher.fit(X_train, y_train)
    return {'best_params': searcher.best_params_, 'best_model': searcher.best_estimator_}
In [48]:
#randomforest_model = RandomForestRegressor()
In [49]:
#param_distributions ={
#            "n_estimators": (10,100, 250),
#            "max_depth": (5, 15),
#            "min_samples_leaf": (2, 25),
#            "max_features": (0.1, 0.9),
#        }
In [50]:
#randomSearchCV(randomforest_model,features,target, param_distributions, n_iter=10, cv=5, scoring=None, n_jobs=-1,
#               random_state=101)
In [51]:
#model =RandomForestRegressor(max_depth=15, max_features=0.9, min_samples_leaf=2,
#                       n_estimators=250, random_state=101)
In [52]:
#model.fit(X_train,y_train)
In [53]:
#test_pred= model.predict(X_test)
In [54]:
#np.sqrt(mean_squared_error(y_test, test_pred))
In [55]:
#pd.DataFrame({"features":model.feature_names_in_ ,"importance":model.feature_importances_}).sort_values("importance",ascending=False)
In [56]:
#pd.DataFrame({'actual':y_test, 'predicted': test_pred})

BayesianOptimization¶

In [57]:
#!pip install bayesian-optimization
In [58]:
#from sklearn.model_selection import cross_val_score
#from bayes_opt import BayesianOptimization

#def rfc_cv(n_estimators, max_depth, min_samples_leaf, max_features, data, targets):
#    estimator = RandomForestRegressor(
#        n_estimators=n_estimators,
#        max_depth=max_depth,
#        min_samples_leaf=min_samples_leaf,
#        max_features=max_features,
#        random_state=121
#    )
#    cval = cross_val_score(estimator, data, targets,
#                           scoring=None, cv=3) 
#    return cval.mean()

#def rfc_crossval(n_estimators, max_depth, min_samples_leaf, max_features):
#        return rfc_cv(
#            n_estimators=int(n_estimators),
#            max_depth=int(max_depth),
#            min_samples_leaf=int(min_samples_leaf),
#            max_features=max(min(max_features, 0.999), 1e-3),
#            data=X_train,
#            targets=y_train,
#        )
        
#optimizer = BayesianOptimization(
#        f=rfc_crossval,
#        pbounds={
#            "n_estimators": (10, 250),
#            "max_depth": (5,15),
#            "min_samples_leaf": (2, 25),
#            "max_features": (0.1, 0.999),
#        },
#        random_state=111,
#        verbose=2
#    )
#optimizer.maximize(n_iter=10)

#best_params = optimizer.max['params']
#best_params['max_depth'] = int(best_params['max_depth'])
#best_params['max_features'] = best_params['max_features']
#best_params['min_samples_leaf'] = int(best_params['min_samples_leaf'])
#best_params['n_estimators'] = int(best_params['n_estimators'])

#best_model = RandomForestRegressor(random_state=121)
#best_model.set_params(**best_params)
#best_model.fit(X_train, y_train)
In [59]:
#model_1 =RandomForestRegressor(max_depth=13, max_features=0.4667195539991278,
#                      min_samples_leaf=14, n_estimators=226, random_state=121)
In [60]:
#model_1.fit(X_train,y_train)
In [61]:
#test_pred= model_1.predict(X_test)
In [62]:
#np.sqrt(mean_squared_error(y_test, test_pred))
In [63]:
#pd.DataFrame({"features":model.feature_names_in_ ,"importance":model.feature_importances_}).sort_values("importance",ascending=False)
In [64]:
#pd.DataFrame({'actual':y_test, 'predicted': test_pred})
In [3]:
# One-row-per-dataset summary of basic data-quality stats (extensible by
# appending more DataFrames to `datasets`).
datasets = [data]

summary_rows = []
for df in datasets:  # `df`, not `data`, to avoid shadowing the global frame
    null_counts = df.isnull().sum()  # compute once, reuse for both columns
    summary_rows.append({
        # df.columns directly — no need to round-trip through isnull().sum()
        'columns': ', '.join(df.columns),
        'total_rows': df.shape[0],
        'total_cols': df.shape[1],
        # .duplicated().sum() avoids materializing the duplicate rows
        'total_duplicate': int(df.duplicated().sum()),
        'total_null': int(null_counts.sum()),
        'null_cols': ', '.join(col for col, n in null_counts.items() if n > 0),
    })
data_summary = pd.DataFrame(summary_rows)
data_summary.style.background_gradient(cmap='YlGnBu')
Out[3]:
  columns total_rows total_cols total_duplicate total_null null_cols
0 longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, median_house_value, ocean_proximity 20640 10 0 207 total_bedrooms
In [ ]: